# train data
data = read.csv('../data/train.csv')
data = data[-1]
# Mahalanobis distance 2D
for (i in 1:9) {
  name1 = colnames(data)[i]
  for (j in (i+1):10) {
    name2 = colnames(data)[j]
    hw = data.frame(name1=data[, i], name2=data[, j])
    ord = order(mahalanobis(hw, colMeans(hw), cov(hw)), decreasing=T)
    outlier = rep(F, nrow(hw))
    outlier[ord[1:100]] = T
    pch = outlier * 16
    plot(hw, pch=pch, xlab=name1, ylab=name2)
  }
}

# Mahalanobis distance 3D
library(rgl)
name1 = colnames(data)[1]
name2 = colnames(data)[2]
name3 = colnames(data)[3]
hw = data.frame(name1=data[, 1], name2=data[, 2], name3=data[, 3])
ord = order(mahalanobis(hw, colMeans(hw), cov(hw)), decreasing=T)
outlier = rep(F, nrow(hw))
outlier[ord[1:100]] = T
col = outlier + 1
plot3d(hw$name1, hw$name2, hw$name3, type='s', col=col, xlab=name1, ylab=name2, zlab=name3)
# autoencoders, init h2o and train/save model
#library(h2o)
#h2o.init(nthreads = -1)
#data_hf = as.h2o(data)
#features = setdiff(colnames(data_hf), "Cover_Type")
#model_nn = h2o.deeplearning(x = features, 
#                            training_frame = data_hf, 
#                            model_id = 'autoencoder',
#                            autoencoder = T,
#                            ignore_const_cols = F,
#                            seed = 57,
#                            hidden = c(25, 10, 25),
#                            epochs = 100,
#                            activation = 'Tanh')
#h2o.saveModel(model_nn, path='autoencoder', force = T)
#model_nn
# load Autoencoder model and identify outliers
library(h2o)
## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit http://docs.h2o.ai
## 
## ----------------------------------------------------------------------
## 
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
## 
##     cor, sd, var
## The following objects are masked from 'package:base':
## 
##     %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
h2o.init(nthreads = -1)
## 
## H2O is not running yet, starting it now...
## 
## Note:  In case of errors look at the following log files:
##     C:\Users\DjoksPC\AppData\Local\Temp\Rtmp0kL6pp/h2o_Djok_started_from_r.out
##     C:\Users\DjoksPC\AppData\Local\Temp\Rtmp0kL6pp/h2o_Djok_started_from_r.err
## 
## 
## Starting H2O JVM and connecting: . Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         3 seconds 40 milliseconds 
##     H2O cluster timezone:       EET 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.22.1.1 
##     H2O cluster version age:    2 months and 5 days  
##     H2O cluster name:           H2O_started_from_R_Djok_bti730 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   1.76 GB 
##     H2O cluster total cores:    4 
##     H2O cluster allowed cores:  4 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         Algos, AutoML, Core V3, Core V4 
##     R Version:                  R version 3.5.2 (2018-12-20)
data_hf = as.h2o(data)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%
model_nn = h2o.loadModel("autoencoder/autoencoder")
anomaly = h2o.anomaly(model_nn, data_hf) %>%
  as.data.frame() %>%
  tibble::rownames_to_column() %>%
  mutate(Cover_Type = as.vector(data$Cover_Type))

mean_mse = anomaly %>%
  group_by(Cover_Type) %>%
  summarise(mean = mean(Reconstruction.MSE))

ggplot(anomaly, aes(x = as.numeric(rowname), y = Reconstruction.MSE, color = as.factor(Cover_Type))) +
  geom_point(alpha = 0.3) +
  geom_hline(data = mean_mse, aes(yintercept = mean, color = as.factor(Cover_Type))) +
  scale_color_brewer(palette = "Set1") +
  labs(x = "instance number",
       color = "Cover_Type")

# LOF
library(dbscan)
for (i in 1:9) {
  name1 = colnames(data)[i]
  for (j in (i+1):10) {
    name2 = colnames(data)[j]
    lof_data = data.frame(name1 = data[, i], name2 = data[, j])
    loff = lof(lof_data, k = 5)
    plot(lof_data, pch = ".", main = "LOF (k=3)", xlab=name1, ylab=name2)
    points(lof_data, cex = (loff-1)*3, pch = 1, col="red")
    text(lof_data[loff>2,], labels = round(loff, 1)[loff>2], pos = 3)
  }
}

# Isolation Forest
# library(devtools)
# install_github("Zelazny7/isofor")
library(isofor)
data = read.csv('../data/train.csv')
data = data[-1]

for (i in 1:9) {
  name1 = colnames(data)[i]
  for (j in (i+1):10) {
    name2 = colnames(data)[j]
    hw = data.frame(name1=data[, i], name2=data[, j])
    mod = iForest(X = hw, 200, 32)
    p = predict(mod, hw)
    col = ifelse(p > quantile(p, 0.95), "red", "blue")
    plot(hw, col=col, xlab=name1, ylab=name2)
  }
}

library(mrfDepth)
data = read.csv('../data/train.csv')
# data = data[-1]
selected_data = data[, c('Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology', 
                         'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways', 
                         'Hillshade_9am', 'Hillshade_Noon', 'Horizontal_Distance_To_Fire_Points')
                     ]

result = outlyingness(x = selected_data)
outliers = which(!result$flagX)
plot(selected_data)
head(selected_data$outliers)
## NULL
points(selected_data[outliers,], col = "red")